1. In-silico PCR

  • Objectives and notes
    • Original coordinates define the location of Single Nucleotide Variants (SNV), centered in an amplicon
    • Run isPCR on GRCh38 genome to define amplicon START and END coordinates
    • Verify primer design specificity
    • Obtain amplicon sequence and calculate GC content
# Global R markdown code chunk options
knitr::opts_chunk$set(message=FALSE, 
                      warning = FALSE, 
                      error=FALSE, 
                      echo=TRUE, 
                      cache=FALSE, 
                      fig.width = 7, fig.height = 6, 
                      fig.align = 'left')

1.1 GRCh38 in-silico PCR summary

  • 1997 unique primers
  • 2086 isPCR results
  • 39 primer pairs with more than 1 predicted isPCR product.
  • 1958 primer pairs with only 1 isPCR product (Perfect_In_silico_specificity = TRUE)
  • 4 primer pairs without isPCR products
  • 2070 isPCR products with expected size range of 875:925, (Intended_interval = TRUE)
  • 2044 isPCR products within the expected chromosome (Intended_chr = TRUE)
  • 2040 isPCR products with expected product size and chromosome (Intended_amplicon = TRUE)
# Reading in silico PCR results

f_dir <- paste0(wd, "All_chrom_in_silico_PCR_files_GRCh38/")
f_list <- list.files(f_dir)

in_silico_list <- lapply(1:21, function(x) read.csv(paste0(f_dir, f_list[x]), header = TRUE))
df_in_silico <- rbindlist(in_silico_list)

#############

df_in_silico$X <- NULL

# Renaming columns to indicate their coordinate genome version 
colnames(df_in_silico)[10] <- "Chr_GRCh38_pred"
colnames(df_in_silico)[11] <- "Amplicon_Start_GRCh38"
colnames(df_in_silico)[12] <- "Amplicon_End_GRCh38"
colnames(df_in_silico)[13] <- "Amplicon_length_GRCh38"
colnames(df_in_silico)[15] <- "Sequence_GRCh38"

#head(df_in_silico[,1:14])
#dim(df_in_silico)  # 2086 PCR products from 1964 primer pairs.

# Flag amplicons with multiple PCR products
# In silico PCR was run with maxProductSize = 1500 bp.
# Most byproducts have much shorter lengths, making them likely to be packaged into viruses, which has a capacity of 900 bp. 
# In silico prediction of longer products that may not be packaged into AAV due to length constraint may cause reduced efficiency of PCR amplification and decreased abundance of these amplicons. 
# Conclusion: In-silico PCR confirmed high specificity of primer design, and viral packaging. 

# There are 39 amplicons with more than 1 predicted in silico PCR product.
df <- as.data.frame(table(df_in_silico$UNIQID))
unspecific_PCR <- dplyr::filter(df, Freq > 1)  

# Unspecific amplicons and their product sizes
#as.data.frame(dplyr::filter(df_in_silico, UNIQID %in% unspecific_PCR$Var1)[,c("UNIQID", "Amplicon_length_GRCh38", "PCR_efficiency")])

# A histogram of isPCR product lengths.
p1 <- ggplot(df_in_silico, aes(x = Amplicon_length_GRCh38))+
    geom_density()+
    geom_vline(xintercept = median(na.omit(df_in_silico$Amplicon_length_GRCh38)), color = "red", linetype = 2)+
    annotate("text", label = "median = 906 bp", x = 950, y = 0.05, size = 6, colour = "red")+
    theme_bw()+
    labs(title = "A histogram of all isPCR product lengths", x = "Amplicon lengths [bp]")+
    theme(plot.title = element_text(hjust = 0.5))+
    coord_cartesian(xlim = c(800, 1000))


# A histogram of isPCR product lengths predicted as unspecific
p2 <- ggplot(dplyr::filter(df_in_silico, UNIQID %in% unspecific_PCR$Var1), 
       aes(x = Amplicon_length_GRCh38))+
    geom_density()+
    theme_bw()+
    labs(title = "A histogram of unspecific isPCR product lengths", x = "Amplicon lengths [bp]")+
    theme(plot.title = element_text(hjust = 0.5))
plot_grid(p1, p2, 
          labels = c('A', 'B'), 
          rel_widths = c(1.2, 1.2),
          vjust = 1.5)
Histograms of isPCR products. (A) All isPCR products, n = 2086; (B) Unspecific isPCR products, n =  128; Unique primer pairs, n = 1997

Histograms of isPCR products. (A) All isPCR products, n = 2086; (B) Unspecific isPCR products, n = 128; Unique primer pairs, n = 1997

# Conclusions: 
# 1. The majority of unspecific products have lengths compatible with AAV packaging
# 2. There are two PCR products of 1450 bp, and one 78 bp

# Defines Perfect_In_silico_specificity
df_in_silico$Perfect_In_silico_specificity <- ifelse(df_in_silico$UNIQID %in% as.character(unspecific_PCR$Var1), FALSE, TRUE)


# GC content calculation
GC_cont <- function(x){
  
x <- toupper(x)
num_g <- str_count(x, "G")
num_c <- str_count(x, "C")
gc_content <- (num_g + num_c) / str_length(x)
gc_content

}

df_in_silico$GC_content <- GC_cont(df_in_silico$Sequence)


# I'm flagging amplicons if they are intended products if they chromosomes match, and PCR product length is +/- 5 bp from the product predicted on hg19 genome. 

# Fixing X chrom name and chrom class encoding to permit logical operations
# The line below is an ugly hack.
#df_in_silico$CHROMOSOME <- ifelse(is.na(as.numeric(df_in_silico$CHROMOSOME)), 23, as.numeric(df_in_silico$chr))
#df_in_silico$Chr_GRCh38_pred <- as.numeric(df_in_silico$Chr_GRCh38_pred)
#df_in_silico$chr <- as.numeric(df_in_silico$chr)

# Removes 4 records lacking predicted PCR products.
d <- na.omit(df_in_silico)

#unique(setdiff(df_in_silico$UNIQID, d$UNIQID)) 
#No PCR prod for: 
# "1956_2_40385062_C_G_11869.p1", 
# "408_16_78441213_C_G_12937.p1", 
# "2712_3_68059361_A_C_13159.p1", 
# "940_17_35098691_G_A_14586.p1"

# This is different from STAR408, because we don't have product lengths in primer spreadsheet
# Unlike the STAR408 project, in silico PCR is the only source of amplicon sequence length.
# Start and End coordinates denounce SNVs, not amplicons.
# Amplicons were designed to be ~900 bases. 

# I called amplicons "Intended_interval" when they fall between 875 : 925.
d$Intended_interval <- sapply(1:nrow(d), function(x) d$Amplicon_length_GRCh38[x] 
       %in% c(875 : 925))

# Extracting extra amplicons without predicted in silico PCR products
no_PCR_amp <- dplyr::filter(df_in_silico, UNIQID %in% c("1956_2_40385062_C_G_11869.p1", "408_16_78441213_C_G_12937.p1", "2712_3_68059361_A_C_13159.p1", "940_17_35098691_G_A_14586.p1"))

no_PCR_amp$Intended_interval <- NA

# Adding amplicons lacking pred. PCR prod. to the full dataset
d2 <- rbind(d, no_PCR_amp)  

# Checks if the chrom number is as expected
d2$Intended_chr <- d2$CHROMOSOME == d2$Chr_GRCh38_pred

# Intended_amplicon has expected chr and intended interval
# NOTE: consider renaming INtended_interval to Expected_isPCR_length
d2$Intended_amplicon <- d2$Intended_chr & d2$Intended_interval

#2086 - total number of amplicons
#nrow(d2) 

# 2040 -  amplicons meeting Intended_amplicon criteria
#sum(na.omit(d2$Intended_amplicon))

# 1997 - number of unique IDs
# length(unique(d2$UNIQID))   

# Conclusion: some amplicons have multiple isPCR products and match the Intended_amplicon criteria 

#Manual inspection
# 42 spurious amplicons
#dim(dplyr::filter(d2,  Intended_amplicon == FALSE))

# 42 PCR byproducts 
PCR_byproducts <- dplyr::filter(d2,  Intended_amplicon == FALSE) 

#unique(PCR_byproducts$UNIQID)  # 18 UNIQID in PCR_byproducts


# For activity modeling I'll focus only on these with Perfect_In_silico_specificity
isPCR <-  dplyr::filter(d2, Perfect_In_silico_specificity == TRUE)

# There are 1958 of these amplicons. This is a good set I'll be using for lm modeling
#nrow(isPCR)

2. Assay reproducibility

  • Amplicons were realigned to GRCh38 and processed by KC.
  • Read duplicates were removed prior to counting reads/amplicon.
  • GRCh37 was used for primer design and my initial isPCR.
  • Amplicon counts were generated for all ampicons ever made/used in the lab, which allows us to track potential contamination.
  • The V4 and V5 datasets refer to two biological replicates/ injections performed by JL. LSF determined good reproducibility between these two biological replicates in her initial analysis.
#Let's load the V4 and V5 data. This code has been adapted from Linda Su-Feher analysis in 20200116_Combined_Analysis.R

## load v4 + v5 data
## dims: 2576 x 19-22

data.v4 <- read.table("Amplicon_Counts/ASD1KV4_Combined_Amplicons_DupRem.txt", sep = "\t", header = T, quote = "")

data.v5 <- read.table("Amplicon_Counts/ASD1KV5_Combined_Amplicons_DupRem.txt", sep = "\t", header = T, quote = "")
# The two bed files contain dictionaries between GRCh37 and GRCh38 SNV coordinates. I think this was done using Bioconductor liftOver

# Combined_408_ASD30-1000_GRCh38_FORALLELES.bed contains the exact amplicon coordinates for STAR408, ASD30 and ASD100 libraries; and the 1700 bp guesstimates for the ASD1K library, extended 850 bp each direction from SNV coordinates.

# Combined_408_ASD30-1000_GRCh38_unflanked.bed contains SNV coordinates for ASD1K

# This is not necessary because GRCh37 coordinates were later estimated from isPCR (in_silico_expected_chr_GRCh37)

d <- read.table("Coordinates/Combined_408_ASD30-1000_GRCh38_FORALLELES.bed")
e <- read.table("Coordinates/Combined_408_ASD30-1000_GRCh38_unflanked.bed")

# Examples of amplicon ID formats:
# STAR408: "1_CACNA1C or 1UN_CACNA1C", "41_Epilepsy",  UN stands for unique or non-overlapping 
# ASD70: "27_12303.p1_Amp", all have _Amp
# ASD30: "1-13111.p1"  number - patient number p1 or s1
# ASD1k: "1340_3_56717417_A_T_13043.s1"

# A function annotating amplicons by library type
lib_annotation <- function(x){  

x$library <- ifelse(
grepl("^\\d+(UN)?_[A-Za-z0-9]+$", x$V4, perl=TRUE), "STAR408", "")

x$library <- ifelse(
grepl("_Control_", x$V4, perl=TRUE), "STAR408", x$library)

x$library <- ifelse(
grepl("^.*_Amp$", x$V4, perl = TRUE), "ASD70", x$library)

x$library <- ifelse(
grepl("^\\d+-\\d+.[sp]1$", x$V4, perl = TRUE), "ASD30", x$library)

x$library <- ifelse(
grepl("^\\d+_\\d+_\\d+_([ATCG]*)_([ATCG]*)_\\d+.(\\w)\\d$", x$V4, perl = TRUE), "ASD1k", x$library)

colnames(x) <- c("Chr", "START_GRCh38", "END_GRCh38", "UNIQID", "Library")

x
}

#dim(d)
#dim(e) # e has 20 more rows

d <- lib_annotation(d)
e <- lib_annotation(e)

# These are the two missing ASD1k amplicons in d: 
# "931_1_144520557_G_A_14547.p1"
# "2573_9_69219445_T_C_11218.p1"
#setdiff(e$UNIQID, d$UNIQID) 
#setdiff(d$UNIQID, e$UNIQID)

#length(e$UNIQID) #2596
#length(d$UNIQID) #2576

#length(unique(e$UNIQID)) # 2578 There are some duplicated UNIQIDs in e
#length(unique(d$UNIQID)) # 2576

#nrow(filter(e, Library == "ASD1k")) # 1996
#nrow(filter(d, Library == "ASD1k")) # 1994

# Reading in silico PCR GRCh37 from pre-defined chromosomes data - DEPRICATED, delete
# Pre defined chromosomes refers to running isPCR only against a the expected chromosome. 
#in_silico_expected_chr_GRCh37 <- read.csv("G:/Shared drives/Nord Lab - Data/SFRI00/ASD1000/ASD1KV5/Karol/In_silico_PCR/Pre_defined_chrom_in_silico_PCR_GRCh37/Amplicons_w_edge_coordinates_and_seq_pre_def_chrom_GRCh37_1_to_1997.csv")


# There are 1997 unique IDs, but even this chrom pre-determined algorithm found some off target products
#length(unique(df_in_silico$UNIQID))  # 1997
#length((df_in_silico$UNIQID))        # 2086
#
# one missing in the e object
#setdiff(unique(df_in_silico$UNIQID), filter(e, Library == "ASD1k")$UNIQID) #"1576_19_40388570_G_A_14009.s1" is missing
#filter(df_in_silico, UNIQID == "1576_19_40388570_G_A_14009.s1") # This amplicon is predicted to have a product.
# Below is Linda's code from 20200116_Combined_Analysis.R
# It indicates good reproducibility between the V4 and V5 replicates. 
# I modified it adding a filtering step for low represented amplicons with less than 100 counts across DNA samples

## merge data: 2576 x 40
data.merge <- merge(data.v5, data.v4, by = "Amplicon")
data.merge$Undetermined_S0.x <- NULL
data.merge$Undetermined_S0.y <- NULL

# NOTE - add filtering column instead of hard thresholding - consider at later stage in the analysis
# Filtering out amplicons with less than 100 reads at DNA level 
# data.merge <- data.merge[rowSums(data.merge[,grepl("AKg",colnames(data.merge))]) > 100,] # down to 1197 
# Removing all non ASD1k amplicons which may increase the consistency between the V4 and V5 replicates.


## Add some colors to check for contamination 
data.merge <- merge(data.merge, d[,c("UNIQID", "Library")], by.x = "Amplicon", by.y = "UNIQID")
data.merge_ASD1k <- dplyr::filter(data.merge, Library == "ASD1k")

data.merge$Library <- NULL
data.merge_ASD1k$Library <- NULL

# Calculate proportion
amp.prop <- data.merge_ASD1k
rownames(amp.prop) <- amp.prop$Amplicon
amp.prop$Amplicon <- NULL
samples <- colnames(amp.prop)

# Proportion function here: (x+1)/(sum(x)+1)  # +1 is padding
## KC: again, the coding practive below is really bad.
amp.prop <- as.data.frame(apply(amp.prop, 2, function(x) { (x+1)/(sum(x, na.rm = T)+1) }))
amp.prop$Amplicon <- rownames(amp.prop)
amp.prop.plot <- amp.prop
#dim(amp.prop.plot)

2.1 Proportion correlations - 1994 amplicons

2.1.1 DNA linear

# GGpairs diagnostic plots
setwd(wd)

# Proportiona DNA
p1 <- ggpairs(amp.prop.plot[, grepl("Kg|Kv", colnames(amp.prop.plot))], 
      lower = list(continuous = wrap("points", size = 0.5, alpha = 0.2, pch = 16), 
                   combo = "box", discrete = "blank", na = "na"),
      diag = list(discrete = "blankDiag", continuous = wrap("densityDiag", alpha = 0.4), na = "blankDiag"), 
      upper = list(continuous = wrap('cor', size = 3)))+ 
      theme_bw()+ 
      theme(panel.grid.major = element_blank(), 
            panel.grid.minor = element_blank(), 
            text = element_text(size = 8), 
            axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

p1

2.1.2 DNA log2

# Proportion log2 DNA
p2 <- ggpairs(log2(amp.prop.plot[, grepl("Kg|Kv", colnames(amp.prop.plot))]), 
      lower = list(continuous = wrap("points", size = 0.5, alpha = 0.2, pch = 16), 
                   combo = "box", discrete = "blank", na = "na"),
      diag = list(discrete = "blankDiag", continuous = wrap("densityDiag", alpha = 0.4), na = "blankDiag"), 
      upper = list(continuous = wrap('cor', size = 3)))+ 
      theme_bw()+ 
      theme(panel.grid.major = element_blank(), 
            panel.grid.minor = element_blank(), 
            text = element_text(size = 8), 
            axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

p2

2.1.3 RNA linear

# GGpairs diagnostic plots
setwd(wd)

# Proportiona DNA
p3 <- ggpairs(amp.prop.plot[, grepl("Ko|Ks", colnames(amp.prop.plot))], 
      lower = list(continuous = wrap("points", size = 0.5, alpha = 0.2, pch = 16), 
                   combo = "box", discrete = "blank", na = "na"),
      diag = list(discrete = "blankDiag", continuous = wrap("densityDiag", alpha = 0.4), na = "blankDiag"), 
      upper = list(continuous = wrap('cor', size = 3)))+ 
      theme_bw()+ 
      theme(panel.grid.major = element_blank(), 
            panel.grid.minor = element_blank(), 
            text = element_text(size = 8), 
            axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

p3

2.1.4 RNA log2

# GGpairs diagnostic plots
setwd(wd)

# Proportiona DNA
p4 <- ggpairs(log2(amp.prop.plot[, grepl("Ko|Ks", colnames(amp.prop.plot))]), 
      lower = list(continuous = wrap("points", size = 0.5, alpha = 0.2, pch = 16), 
                   combo = "box", discrete = "blank", na = "na"),
      diag = list(discrete = "blankDiag", continuous = wrap("densityDiag", alpha = 0.4), na = "blankDiag"), 
      upper = list(continuous = wrap('cor', size = 3)))+ 
      theme_bw()+ 
      theme(panel.grid.major = element_blank(), 
            panel.grid.minor = element_blank(), 
            text = element_text(size = 8), 
            axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

p4

2.1.5 DNA log2 V4

#Let's construct a metadata df
metadata <- data.frame("Sample_name" = c(colnames(data.v4)[2:18], colnames(data.v5)[2:21]))
metadata$Replicate <- ifelse(metadata$Sample_name %in% colnames(data.v4)[2:18], "V4", "V5")

class_of_nucleic_acid <- ifelse(grepl("AKg", metadata$Sample_name), "gDNA", NA)
class_of_nucleic_acid <- ifelse(grepl("AKs|AKo", metadata$Sample_name), "cDNA", class_of_nucleic_acid)
class_of_nucleic_acid <- ifelse(grepl("AKv", metadata$Sample_name), "virus", class_of_nucleic_acid)

metadata$Nucleic_acid <- class_of_nucleic_acid


# Proportion log2 DNA
ggpairs(log2(amp.prop.plot[, 
                           dplyr::filter(metadata, Replicate == "V4", 
                                                   Nucleic_acid %in% c("gDNA", "virus"))$Sample_name]), 
      lower = list(continuous = wrap("points", size = 0.5, alpha = 0.2, pch = 16), 
                   combo = "box", discrete = "blank", na = "na"),
      diag = list(discrete = "blankDiag", continuous = wrap("densityDiag", alpha = 0.4), na = "blankDiag"), 
      upper = list(continuous = wrap('cor', size = 3)))+ 
      theme_bw()+ 
      theme(panel.grid.major = element_blank(), 
            panel.grid.minor = element_blank(), 
            text = element_text(size = 8), 
            axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

2.1.6 RNA log2 V4

JT: The initial reverse transcription was done with either polyT primer (odt20, “o”) or with the STAR-rev primer (“s”). The purpose being that the S-primer sets should pre-enrich for our transcripts and have higher concentration of amplify-able cDNA for downstream steps.

KC: I’ll sum the reads of the “o” and “s” samples before calculating proportions. The purpose of combining these RNA samples is to represent biological, not technical variance in the Ratiometric activity.

# Proportion log2 DNA
ggpairs(log2(amp.prop.plot[, 
                           dplyr::filter(metadata, Replicate == "V4", 
                                                   Nucleic_acid %in% c("cDNA"))$Sample_name]), 
      lower = list(continuous = wrap("points", size = 0.5, alpha = 0.2, pch = 16), 
                   combo = "box", discrete = "blank", na = "na"),
      diag = list(discrete = "blankDiag", continuous = wrap("densityDiag", alpha = 0.4), na = "blankDiag"), 
      upper = list(continuous = wrap('cor', size = 3)))+ 
      theme_bw()+ 
      theme(panel.grid.major = element_blank(), 
            panel.grid.minor = element_blank(), 
            text = element_text(size = 8), 
            axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

2.1.7 DNA log2 V5

# Proportion log2 DNA
ggpairs(log2(amp.prop.plot[, 
                           dplyr::filter(metadata, Replicate == "V5", 
                                                   Nucleic_acid %in% c("gDNA", "virus"))$Sample_name]), 
      lower = list(continuous = wrap("points", size = 0.5, alpha = 0.2, pch = 16), 
                   combo = "box", discrete = "blank", na = "na"),
      diag = list(discrete = "blankDiag", continuous = wrap("densityDiag", alpha = 0.4), na = "blankDiag"), 
      upper = list(continuous = wrap('cor', size = 3)))+ 
      theme_bw()+ 
      theme(panel.grid.major = element_blank(), 
            panel.grid.minor = element_blank(), 
            text = element_text(size = 8), 
            axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

2.1.8 RNA log2 V5

# Proportion log2 DNA
ggpairs(log2(amp.prop.plot[,
                           dplyr::filter(metadata, Replicate == "V5", 
                                                   Nucleic_acid %in% c("cDNA"))$Sample_name]), 
      lower = list(continuous = wrap("points", size = 0.5, alpha = 0.2, pch = 16), 
                   combo = "box", discrete = "blank", na = "na"),
      diag = list(discrete = "blankDiag", continuous = wrap("densityDiag", alpha = 0.4), na = "blankDiag"), 
      upper = list(continuous = wrap('cor', size = 3)))+ 
      theme_bw()+ 
      theme(panel.grid.major = element_blank(), 
            panel.grid.minor = element_blank(), 
            text = element_text(size = 8), 
            axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

2.2 PCA

2.2.1 Scree plot

library(plotly)

pca_data <- prcomp(amp.prop.plot[,metadata$Sample_name], center = TRUE, scale = TRUE)

pca_data_var_expl <- pca_data$sdev^2 / sum(pca_data$sdev^2)
scree <- data.frame("PC" = paste0(seq(1:37)), "Var_exp" = pca_data_var_expl)
  
pca_data <- as.data.frame(pca_data$rotation)

pca_data$Sample_name <- rownames(pca_data)
pca_data <- merge(metadata, pca_data, by = "Sample_name")

pca_data$Nucleic_acid <- ifelse(pca_data$Nucleic_acid == "gDNA", "DNA", pca_data$Nucleic_acid)
pca_data$Nucleic_acid <- ifelse(pca_data$Nucleic_acid == "cDNA", "RNA", pca_data$Nucleic_acid)
pca_data$Nucleic_acid <- ifelse(pca_data$Nucleic_acid == "virus", "Virus MaxiPrep", pca_data$Nucleic_acid)

# RT primer type
pca_data_RNA <- dplyr::filter(pca_data, Nucleic_acid == "RNA") 
pca_data_RNA$RT_primer <- ifelse(grepl("o", pca_data_RNA$Sample_name), "OdT20", "STAR-rev")

#pca_data_RNA$Sample_name[grepl("o", pca_data_RNA$Sample_name, "Oligo", "STAR")]
pca_data <- merge(pca_data_RNA[, c("Sample_name", "RT_primer")], pca_data, all = TRUE)
pca_data$RT_primer <- ifelse(is.na(pca_data$RT_primer), "NA", pca_data$RT_primer) 


#head(pca_data)
ggplot(scree[1:5,], aes(x = PC, y = Var_exp))+
  geom_bar(stat = "identity")+
  scale_y_continuous(breaks=seq(0, 1, 0.1))+
  theme_bw()+
  labs(title = "Scree plot", x = "PC", y = "Variance explained")+
  theme(plot.title = element_text(hjust = 0.5))
The first 2 PCs account for 95% of variation

The first 2 PCs account for 95% of variation

2.2.2 PCA DNA + RNA

p <- ggplot(pca_data, aes(x = PC1, y = PC2, color = Nucleic_acid, shape = Replicate, label = Sample_name))+
  geom_point(alpha = 0.7)+
  theme_bw()+
  labs(title = "PC1 vs PC2")+
  theme(plot.title = element_text(hjust = 0.5))

ggplotly(p)

PC2 separates DNA and RNA samples. PC1 separates DNA and RNA samples, with the exception of 3 V4 samples.

p <- ggplot(pca_data, aes(x = PC2, y = PC3, color = Nucleic_acid, shape = Replicate, label = Sample_name))+
  geom_point(alpha = 0.7)+
  theme_bw()+
  labs(title = "PC2 vs PC3")+
  theme(plot.title = element_text(hjust = 0.5))

ggplotly(p)

PC3 separates V4 and V5 RNA samples

2.2.3 PCA - RNA samples only

pca_data <- prcomp(amp.prop.plot[,dplyr::filter(metadata, Nucleic_acid == "cDNA")$Sample_name], center = TRUE, scale = TRUE)

pca_data_var_expl <- pca_data$sdev^2 / sum(pca_data$sdev^2)
scree <- data.frame("PC" = paste0(seq(1:23)), "Var_exp" = pca_data_var_expl)
  
pca_data <- as.data.frame(pca_data$rotation)

pca_data$Sample_name <- rownames(pca_data)
pca_data <- merge(metadata, pca_data, by = "Sample_name")


pca_data$Nucleic_acid <- ifelse(pca_data$Nucleic_acid == "cDNA", "RNA", pca_data$Nucleic_acid)


# RT primer type
pca_data_RNA <- dplyr::filter(pca_data, Nucleic_acid == "RNA") 
pca_data_RNA$RT_primer <- ifelse(grepl("o", pca_data_RNA$Sample_name), "OdT20", "STAR-rev")

#pca_data_RNA$Sample_name[grepl("o", pca_data_RNA$Sample_name, "Oligo", "STAR")]
pca_data <- merge(pca_data_RNA[, c("Sample_name", "RT_primer")], pca_data, all = TRUE)
pca_data$RT_primer <- ifelse(is.na(pca_data$RT_primer), "NA", pca_data$RT_primer) 

pca_data$Biol_rep <- gsub('.*?([0-9]+).*', '\\1', pca_data$Sample_name)
ggplot(scree[1:5,], aes(x = PC, y = Var_exp))+
  geom_bar(stat = "identity")+
  scale_y_continuous(breaks=seq(0, 1, 0.1))+
  theme_bw()+
  labs(title = "Scree plot", x = "", y = "Variance explained")+
  theme(plot.title = element_text(hjust = 0.5))
The first 2 PCs account for 98% of variation

The first 2 PCs account for 98% of variation

p <- ggplot(pca_data, aes(x = PC1, y = PC2, color = RT_primer, shape = Replicate, label = Sample_name))+
  geom_point(alpha = 0.7)+
  theme_bw()+
  labs(title = "PC1 vs PC2")+
  theme(plot.title = element_text(hjust = 0.5))

ggplotly(p)

PC1 and PC2 separates V4 and V5 samples. RT_primer does not separate samples along PC1 or PC2

library(ggrepel)

p <- ggplot(pca_data, aes(x = PC1, y = PC2, color = Biol_rep, shape = Replicate, label = Sample_name))+
  geom_point(alpha = 0.7)+
  theme_bw()+
  labs(title = "PC1 vs PC2")+
  theme(plot.title = element_text(hjust = 0.5))+
  geom_text_repel()


p
Clustering of color-coded technical replicates, PC1 vs PC2

Clustering of color-coded technical replicates, PC1 vs PC2

library(ggrepel)

p <- ggplot(pca_data, aes(x = PC2, y = PC3, color = Biol_rep, shape = Replicate, label = Sample_name))+
  geom_point(alpha = 0.7)+
  theme_bw()+
  labs(title = "PC2 vs PC3")+
  theme(plot.title = element_text(hjust = 0.5))+
  geom_text_repel()


p
Clustering of color-coded technical replicates, PC2 vs PC3

Clustering of color-coded technical replicates, PC2 vs PC3

2.3 Combining technical replicates

# Sum counts of technical replicates

df <- data.merge_ASD1k
data.merge_ASD1k_original <- df

# d = DNA, r = RNA, v = virus
data.merge_ASD1k <- data.frame("Amplicon" = df$Amplicon,
           #V4
           "AKd49" = df$AKg49_S40,
           "AKd50" = df$AKg50_S41,
           "AKd51" = df$AKg51_S42,
           "AKd52" = df$AKg52_S43,
           "AKd53" = df$AKg53_S44,
           
           "AKr49" = c(df$AKo49_S50 + df$AKs49_S45),   
           "AKr50" = c(df$AKo50_S51 + df$AKs50_S46),
           "AKr51" = c(df$AKo51_S52 + df$AKs51_S47),
           "AKr52" = c(df$AKo52_S53 + df$AKs52_S48),
           "AKr53" = c(df$AKo53_S54 + df$AKs53_S49),
           
           "AKv01" = df$AKv01_S56,
           "AKv02" = df$AKv02_S55,
           
           #V5
           "AKd54" = df$AKg54_S83,
           "AKd55" = df$AKg55_S84,
           "AKd56" = df$AKg56_S85,
           "AKd57" = df$AKg57_S86,
           "AKd69" = df$AKg69_S87,
           "AKd70" = df$AKg70_S88,
           "AKd71" = df$AKg71_S89,
           
           "AKr54" = df$AKs54_S90,
           "AKr55" = c(df$AKs55b_S97 + df$AKs55c_S100 + df$AKs55_S91),
           "AKr56" = c(df$AKs56b_S98 + df$AKs56c_S101 + df$AKs56_S92),
           "AKr57" = df$AKs57_S93,
           "AKr69" = df$AKs69_S94,
           "AKr70" = df$AKs70_S95,
           "AKr71" = c(df$AKs71b_S99 + df$AKs71c_S102 + df$AKs71_S96)
           
           )

metadata_original <- metadata

metadata <- data.frame("Sample_name" = colnames(data.merge_ASD1k)[2:27],
                       "Replicate" = c(rep("V4", 12), rep("V5", 14)),
                       "Nucleic_acid" = c(rep("DNA", 5), rep("RNA", 5), 
                                          rep("virus", 2), rep("DNA", 7), rep("RNA", 7))) 

2.3.1 Sequencing depth

seq_depth <- data.frame(
  "Sample_name" = c("AKd49", "AKd50", "AKd51", "AKd52", "AKd53",
                    "AKr49", "AKr49", "AKr50", "AKr50", "AKr51", "AKr51", "AKr52", "AKr52", "AKr53", "AKr53",
                    "AKv01", "AKv02",
                    "AKd54", "AKd55", "AKd56", "AKd57", "AKd69", "AKd70", "AKd71",
                    "AKr54", "AKr55", "AKr55", "AKr55", "AKr56", "AKr56", "AKr56",
                    "AKr57",  "AKr69", "AKr70",
                    "AKr71", "AKr71", "AKr71"),
  
  "Sample_name_original" = metadata_original$Sample_name,
  "Nucleic_acid" = c(rep("DNA", 5), rep("RNA", 10), rep("virus", 2), rep("DNA", 7), rep("RNA", 13)),
  "Read_depth" = colSums(data.merge_ASD1k_original[,2:38]))
                    

ggplot(seq_depth, aes(x = Sample_name, y = Read_depth / 10^6, 
                      color = Sample_name_original, fill = Nucleic_acid))+
  geom_bar(position = "stack", stat = "identity")+
  theme_bw()+
  labs(title = "Read depth", x = " Sample", "Reads [millions]")+
  theme(legend.position = "none", plot.title = element_text(hjust = 0.5))

2.4 Proportion correlations - comb. rep.

amp.prop.plot_original <- amp.prop.plot


# Calculate proportion
amp.prop <- data.merge_ASD1k
rownames(amp.prop) <- amp.prop$Amplicon
amp.prop$Amplicon <- NULL
samples <- colnames(amp.prop)

# Proportion function here: (x+1)/(sum(x)+1)  # +1 is padding
amp.prop <- as.data.frame(apply(amp.prop, 2, function(x) { (x+1)/(sum(x, na.rm = T)+1) }))
amp.prop$Amplicon <- rownames(amp.prop)
amp.prop.plot <- amp.prop

2.4.1 DNA log2

# Proportion log2 DNA
ggpairs(log2(amp.prop.plot[, grepl("Kd|Kv", colnames(amp.prop.plot))]), 
      lower = list(continuous = wrap("points", size = 0.5, alpha = 0.2, pch = 16), 
                   combo = "box", discrete = "blank", na = "na"),
      diag = list(discrete = "blankDiag", continuous = wrap("densityDiag", alpha = 0.4), na = "blankDiag"), 
      upper = list(continuous = wrap('cor', size = 3)))+ 
      theme_bw()+ 
      theme(panel.grid.major = element_blank(), 
            panel.grid.minor = element_blank(), 
            text = element_text(size = 8), 
            axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

2.4.2 RNA log2

# Proportion log2 RNA
ggpairs(log2(amp.prop.plot[, grepl("Kr", colnames(amp.prop.plot))]), 
      lower = list(continuous = wrap("points", size = 0.5, alpha = 0.2, pch = 16), 
                   combo = "box", discrete = "blank", na = "na"),
      diag = list(discrete = "blankDiag", continuous = wrap("densityDiag", alpha = 0.4), na = "blankDiag"), 
      upper = list(continuous = wrap('cor', size = 3)))+ 
      theme_bw()+ 
      theme(panel.grid.major = element_blank(), 
            panel.grid.minor = element_blank(), 
            text = element_text(size = 8), 
            axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

3. Ratiometric activity

## Manually calculate ratiometric activity (RNA/DNA)
amp.act <- data.frame("amp_id" = rownames(amp.prop.plot), "Color" = "ASD1k")

## manually define ratiometric activity for V5
amp.act$Ratio.V5.54 <- (amp.prop.plot$AKr54) / (amp.prop.plot$AKd54)
amp.act$Ratio.V5.55 <- (amp.prop.plot$AKr55) / (amp.prop.plot$AKd55)
amp.act$Ratio.V5.56 <- (amp.prop.plot$AKr56) / (amp.prop.plot$AKd56)
amp.act$Ratio.V5.57 <- (amp.prop.plot$AKr57) / (amp.prop.plot$AKd57)
amp.act$Ratio.V5.69 <- (amp.prop.plot$AKr69) / (amp.prop.plot$AKd69)
amp.act$Ratio.V5.70 <- (amp.prop.plot$AKr70) / (amp.prop.plot$AKd70)
amp.act$Ratio.V5.71 <- (amp.prop.plot$AKr71) / (amp.prop.plot$AKd71)


## manually define ratiometric activity for V4
amp.act$Ratio.V4.49 <- (amp.prop.plot$AKr49) / (amp.prop.plot$AKd49)
amp.act$Ratio.V4.50 <- (amp.prop.plot$AKr50) / (amp.prop.plot$AKd50)
amp.act$Ratio.V4.51 <- (amp.prop.plot$AKr51) / (amp.prop.plot$AKd51)
amp.act$Ratio.V4.52 <- (amp.prop.plot$AKr52) / (amp.prop.plot$AKd52)
amp.act$Ratio.V4.53 <- (amp.prop.plot$AKr53) / (amp.prop.plot$AKd53)

3.1 Correlation plots - 1994 amplicons

3.1.1 log2

# Activity sample corelation plots 
## Linear
p <- ggpairs(log2(amp.act[, grepl("Ratio", colnames(amp.act))]), 
             lower = list(continuous = wrap("points", size = 0.5, alpha = 0.2, pch = 1), 
                          combo = "box", discrete = "blank", na = "na"),
             diag = list(discrete = "blankDiag", continuous = wrap("densityDiag", alpha = 0.4), 
                         na = "blankDiag"), 
             upper = list(continuous = wrap('cor', size = 3))) + 
             theme_bw()+ 
             theme(panel.grid.major = element_blank(), 
                   panel.grid.minor = element_blank(), 
                   text = element_text(size = 10), 
                   axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

p

4. Mean ratiometric activity and proportions

#Activity Mean and SD
library(genefilter)
library(reshape2)
library(data.table)

amp.act_summary <- amp.act[,1:2]

amp.act_summary$Mean_V5 <- rowMeans(amp.act[,c(3:9)])
amp.act_summary$Mean_V4 <- rowMeans(amp.act[,c(10:14)])
amp.act_summary$Mean_All <- rowMeans(amp.act[,c(3:14)])

amp.act_summary$SD_V5 <- rowSds(as.matrix(amp.act[,c(3:9)]))
amp.act_summary$SD_V4 <- rowSds(as.matrix(amp.act[,c(10:14)]))
amp.act_summary$SD_All <- rowSds(as.matrix(amp.act[,c(3:14)]))



# Proportions
#
#colnames(amp.prop.plot)
#colnames(data.v4)
#colnames(data.v5)


V4_DNA_samples <- dplyr::filter(metadata, Replicate == "V4", Nucleic_acid == "DNA")$Sample_name
V4_RNA_samples <- dplyr::filter(metadata, Replicate == "V4", Nucleic_acid == "RNA")$Sample_name
V5_DNA_samples <- dplyr::filter(metadata, Replicate == "V5", Nucleic_acid == "DNA")$Sample_name
V5_RNA_samples <- dplyr::filter(metadata, Replicate == "V5", Nucleic_acid == "RNA")$Sample_name

DNA_samples <- dplyr::filter(metadata, Nucleic_acid == "DNA")$Sample_name
RNA_samples <- dplyr::filter(metadata, Nucleic_acid == "RNA")$Sample_name


amp.prop.plot$Color <- "ASD1k"

amp.prop_summary <- amp.prop.plot[,c("Amplicon", "Color")]
rownames(amp.prop_summary) <- NULL

#head(amp.prop_summary)


amp.prop_summary$Mean_DNA <- rowMeans(amp.prop.plot[,DNA_samples])
amp.prop_summary$Mean_RNA <- rowMeans(amp.prop.plot[,RNA_samples])
amp.prop_summary$Mean_DNA_V4 <- rowMeans(amp.prop.plot[,V4_DNA_samples])
amp.prop_summary$Mean_RNA_V4 <- rowMeans(amp.prop.plot[,V4_RNA_samples])
amp.prop_summary$Mean_DNA_V5 <- rowMeans(amp.prop.plot[,V5_DNA_samples])
amp.prop_summary$Mean_RNA_V5 <- rowMeans(amp.prop.plot[,V5_RNA_samples])

amp.prop_summary$SD_DNA <- rowSds(amp.prop.plot[,DNA_samples])
amp.prop_summary$SD_RNA <- as.numeric(rowSds(amp.prop.plot[,RNA_samples]))
amp.prop_summary$SD_DNA_V4 <- as.numeric(rowSds(amp.prop.plot[,V4_DNA_samples]))
amp.prop_summary$SD_RNA_V4 <- as.numeric(rowSds(amp.prop.plot[,V4_RNA_samples]))
amp.prop_summary$SD_DNA_V5 <- as.numeric(rowSds(amp.prop.plot[,V5_DNA_samples]))
amp.prop_summary$SD_RNA_V5 <- as.numeric(rowSds(amp.prop.plot[,V5_RNA_samples]))


# Applying a threshold on proportions
amp.prop_summary$Pass_prop_DNA_and_RNA <- ifelse(amp.prop_summary$Mean_DNA > 2^-15 & 
                                                     amp.prop_summary$Mean_RNA > 2^-15, TRUE, FALSE)


## Work zone
## Add Boolean columns denoting min count tests
 min_count = 100
 min_count_DNA <- rowSums(data.merge_ASD1k[,DNA_samples]>min_count) >= ncol(data.merge_ASD1k[,DNA_samples])
 min_count_DNA_V4 <- rowSums(data.merge_ASD1k[,V4_DNA_samples]>min_count) >= ncol(data.merge_ASD1k[,V4_DNA_samples])
 min_count_DNA_V5 <- rowSums(data.merge_ASD1k[,V5_DNA_samples]>min_count) >= ncol(data.merge_ASD1k[,V5_DNA_samples])
 
 min_count_RNA <- rowSums(data.merge_ASD1k[,RNA_samples]>min_count) >= ncol(data.merge_ASD1k[,RNA_samples])
 min_count_RNA_V4 <- rowSums(data.merge_ASD1k[,V4_RNA_samples]>min_count) >= ncol(data.merge_ASD1k[,V4_RNA_samples])
 min_count_RNA_V5 <- rowSums(data.merge_ASD1k[,V5_RNA_samples]>min_count) >= ncol(data.merge_ASD1k[,V5_RNA_samples])
 
 count_Booleans <- data.frame(
   "Amplicon" = data.merge_ASD1k$Amplicon,
   "min_count_DNA" = min_count_DNA,
   "min_count_DNA_V4" = min_count_DNA_V4,
   "min_count_DNA_V5" =  min_count_DNA_V5,
   
   "min_count_RNA" = min_count_RNA,
   "min_count_RNA_V4" = min_count_RNA_V4,
   "min_count_RNA_V5" =  min_count_RNA_V5
 )

 
# Building a comprehensive data frame
 y <- merge(amp.prop_summary, count_Booleans, by = "Amplicon") # 1994 rows, Only ASD1k amplicons
 
 y_ASD1k <- dplyr::filter(y, Color == "ASD1k") # 1994 rows
library(ggplot2)
library(genefilter)
library(ggpmisc)

# A rectangle delinating DNA and RNA proportion thresholds < 2^-15
d=data.frame(x1=c(-Inf, -Inf), x2=c(+Inf, -15), y1=c(-Inf,-15), y2=c(-15, +Inf))


# Passing min DNA counts on all samples
p1 <- ggplot()+
  geom_point(data = y_ASD1k, aes(x = log2(Mean_DNA), 
                                 y = log2(Mean_RNA), 
                                 color = min_count_DNA), alpha = 0.3)+
  theme_bw()+
  geom_rect(data=d, mapping=aes(xmin=x1, xmax=x2, ymin=y1, ymax=y2), color=NA, alpha=0.3)+
  labs(title = "Amplicons with > 100 counts in all DNA")+
  theme(plot.title = element_text(hjust = 0.5),
        legend.position="bottom")


p2 <- ggplot()+
  geom_point(data = y_ASD1k, aes(x = log2(Mean_DNA), y = log2(Mean_RNA), 
                                 color = min_count_DNA_V4), alpha = 0.3)+
  theme_bw()+
  geom_rect(data=d, mapping=aes(xmin=x1, xmax=x2, ymin=y1, ymax=y2), color=NA, alpha=0.3)+
  labs(title = "Amplicons with > 100 counts in V4 replicate DNA")+
  theme(plot.title = element_text(hjust = 0.5),
        legend.position="bottom")



p3 <- ggplot()+
  geom_point(data = y_ASD1k, aes(x = log2(Mean_DNA), y = log2(Mean_RNA), 
                                 color = min_count_DNA_V5), alpha = 0.3)+
  theme_bw()+
  geom_rect(data=d, mapping=aes(xmin=x1, xmax=x2, ymin=y1, ymax=y2), color=NA, alpha=0.3)+
  labs(title = "Amplicons with > 100 counts in V5 replicate DNA")+
  theme(plot.title = element_text(hjust = 0.5),
        legend.position="bottom")





# Passing min DNA counts on all samples
y$min_count_DNA_and_RNA <- y$min_count_RNA & y$min_count_DNA
y_ASD1k$min_count_DNA_and_RNA <- y_ASD1k$min_count_RNA & y_ASD1k$min_count_DNA

p4 <- ggplot()+
  geom_point(data = y_ASD1k, aes(x = log2(Mean_DNA), y = log2(Mean_RNA), 
                                 color = min_count_DNA_and_RNA), alpha = 0.3)+
  theme_bw()+
  geom_rect(data=d, mapping=aes(xmin=x1, xmax=x2, ymin=y1, ymax=y2), color=NA, alpha=0.3)+
  labs(title = "Amplicons with > 100 counts in DNA and RNA samples")+
  theme(plot.title = element_text(hjust = 0.5),
        legend.position="bottom")


plot_grid(p1, p2, p3, p4, 
          labels = c("A", "B", "C", "D"), 
          rel_widths = c(1.2, 1.2),
          vjust = 0.9)

### Dataset for lm ###
# Filtering criteria:
#  1. > 100 reads across all DNA samples, and > 100 across all RNA samples
#  2. Mean DNA proportions and mean RNA proportions > 2^-15

y$lm_dataset <- y$min_count_DNA_and_RNA & y$Pass_prop_DNA_and_RNA
y_ASD1k$lm_dataset <- y_ASD1k$min_count_DNA_and_RNA & y_ASD1k$Pass_prop_DNA_and_RNA

p1 <- ggplot()+
  geom_point(data = y_ASD1k, aes(x = log2(Mean_DNA), y = log2(Mean_RNA), color = lm_dataset), alpha = 0.3)+
  theme_bw()+
  geom_rect(data=d, mapping=aes(xmin=x1, xmax=x2, ymin=y1, ymax=y2), color=NA, alpha=0.3)+
  labs(title = "Dataset cleaning for building the lm model")+
  theme(plot.title = element_text(hjust = 0.5),
        legend.position="bottom")



## plotting lm dataset ##
y_ASD1k_lm <- dplyr::filter(y_ASD1k, lm_dataset == TRUE)

formula <- y ~ x

p2 <- ggplot(y_ASD1k_lm, aes(x = log2(Mean_DNA), y = log2(Mean_RNA)))+
  geom_point(alpha = 0.3)+
  theme_bw()+
  geom_abline(intercept = 0, color = "red", linetype = 2)+
  geom_smooth(formula = formula, method = 'lm', color = "black")+
  stat_poly_eq(formula = formula, 
                aes(label = paste(..rr.label.., ..eq.label.., sep = "~~~")), 
                parse = TRUE)+
 labs(title = "Simple lm fit to the cleaned data, n = 748 amplicons")+
 theme(plot.title = element_text(hjust = 0.5),
        legend.position="bottom")


plot_grid(p1, p2,
          labels = c("A", "B"), 
          rel_widths = c(1.2, 1.2),
          vjust = 0.9)

# save.image("G:/Shared drives/Nord Lab - Data/SFRI00/ASD1000/ASD1KV5/Karol/ASD1k_lm_10_07_2020.RData")
# load("G:/Shared drives/Nord Lab - Data/SFRI00/ASD1000/ASD1KV5/Karol/ASD1k_lm_10_07_2020.RData")
# setwd("G:/Shared drives/Nord Lab - Data/SFRI00/ASD1000/ASD1KV5/Karol/")

5. GC content

# I'm using GC values from in silico PCR prediction, limiting the subset of amplicons to those with only 1 predicted product/
#filter(d2, Perfect_In_silico_specificity == TRUE)

# Merging GC dataset with perfect in silico specificity reduced amplicon number from 748 to 730. Not much loss!
y_ASD1k_lm_GC <- merge(y_ASD1k_lm, dplyr::filter(d2, Perfect_In_silico_specificity == TRUE)[,c(1, 17)], by.x = "Amplicon", by.y = "UNIQID")

# 727 amplicons after removing 3 without isPCR sequence
y_ASD1k_lm_GC <- na.omit(y_ASD1k_lm_GC)

5.1 Plotting and modeling data with GC content

library(ggplot2)

formula <- y ~ x

y_ASD1k_lm_GC$GC_ab_mean <- ifelse(y_ASD1k_lm_GC$GC_content > mean(na.omit(y_ASD1k_lm_GC$GC_content)), TRUE, FALSE)

#y_ASD1k_lm_GC[-c(252, 499, 576),] # Removes rows containing GC values
# na.omit(y_ASD1k_lm_GC)  - the same, just simpler

ggplot(na.omit(y_ASD1k_lm_GC), 
       aes(x = log2(Mean_DNA), y = log2(Mean_RNA), 
           color = GC_content))+
  geom_point(alpha = 0.5)+
  theme_bw()+
  geom_abline(intercept = 0, color = "red", linetype = 2)+
  geom_smooth(formula = formula, method = 'lm', color = "black")+
  stat_poly_eq(formula = formula, 
                aes(label = paste(..rr.label.., ..eq.label.., sep = "~~~")), 
                parse = TRUE)+
  scale_color_gradient2(midpoint = mean(na.omit(y_ASD1k_lm_GC$GC_content)), low="blue3", mid="white",
                     high="red3", space ="Lab" )+
  facet_wrap(~ GC_ab_mean)+
  labs(title = "DNA vs RNA proportions, split at mean GC content")+
  theme(plot.title = element_text(hjust = 0.5),
        legend.position="bottom")
There seems to be limited if any correlation of GC content and amplicon activity

There seems to be limited if any correlation of GC content and amplicon activity

# data.all$MeanRatio_MoM <- data.all$RNA_prop_mean / data.all$DNA_prop_mean

y_ASD1k_lm_GC$MeanRatio_All <- y_ASD1k_lm_GC$Mean_RNA / y_ASD1k_lm_GC$Mean_DNA
y_ASD1k_lm_GC$MeanRatio_V4 <- y_ASD1k_lm_GC$Mean_RNA_V4/ y_ASD1k_lm_GC$Mean_DNA_V4
y_ASD1k_lm_GC$MeanRatio_V5 <- y_ASD1k_lm_GC$Mean_RNA_V5/ y_ASD1k_lm_GC$Mean_DNA_V5    


formula <- y ~ x

GC_cont_plot <- function(x, y){
    ggplot(y_ASD1k_lm_GC, aes(x = log2(get(x)), y = GC_content))+
    geom_point(alpha = 0.5)+
    theme_bw()+
    geom_smooth(formula = formula, method = 'lm', color = "black")+
    stat_poly_eq(formula = formula, 
                aes(label = paste(..rr.label.., ..eq.label.., sep = "~~~")), 
                parse = TRUE)+
    labs(title = y, x = "log2(mean DNA prop. / mean RNA prop.)")+    
    theme(plot.title = element_text(hjust = 0.5, size = 14),
        text = element_text(size = 12))    
    }



plot_grid(GC_cont_plot("MeanRatio_All", "V4 + V5 combined"),
          GC_cont_plot("MeanRatio_V4", "V4"),
          GC_cont_plot("MeanRatio_V5", "V5"),
          nrow = 1,
          labels = c("A", "B", "C"), 
          vjust = 0.9)
V4 and V5 replicates demonstrate oposing GC bias, but the correlation magnitude is small

V4 and V5 replicates demonstrate oposing GC bias, but the correlation magnitude is small

5.2 GC content impact on amp. act.

lm_all_GC <- lm(log2(Mean_RNA) ~ log2(Mean_DNA) + GC_content, data = y_ASD1k_lm_GC)
lm_V4_GC <- lm(log2(Mean_RNA_V4) ~ log2(Mean_DNA_V4) + GC_content, data = y_ASD1k_lm_GC) 
lm_V5_GC <- lm(log2(Mean_RNA_V5) ~ log2(Mean_DNA_V5) + GC_content, data = y_ASD1k_lm_GC)  

lm_all <- lm(log2(Mean_RNA) ~ log2(Mean_DNA), data = y_ASD1k_lm_GC)
lm_V4 <- lm(log2(Mean_RNA_V4) ~ log2(Mean_DNA_V4), data = y_ASD1k_lm_GC) 
lm_V5 <- lm(log2(Mean_RNA_V5) ~ log2(Mean_DNA_V5), data = y_ASD1k_lm_GC)  

# Add AIC and BIC
lm_all_GC$AIC <- round(AIC(lm_all_GC), 1)
lm_V4_GC$AIC <- round(AIC(lm_V4_GC), 1)
lm_V5_GC$AIC <- round(AIC(lm_V5_GC), 1)

lm_all$AIC <- round(AIC(lm_all), 1)
lm_V4$AIC <- round(AIC(lm_V4), 1)
lm_V5$AIC <- round(AIC(lm_V5), 1)

lm_all_GC$BIC <- round(BIC(lm_all_GC), 1)
lm_V4_GC$BIC <- round(BIC(lm_V4_GC), 1)
lm_V5_GC$BIC <- round(BIC(lm_V5_GC), 1)

lm_all$BIC <- round(BIC(lm_all), 1)
lm_V4$BIC <- round(BIC(lm_V4), 1)
lm_V5$BIC <- round(BIC(lm_V5), 1)

5.2.1 V4 + V5

summary(lm_all)
## 
## Call:
## lm(formula = log2(Mean_RNA) ~ log2(Mean_DNA), data = y_ASD1k_lm_GC)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.0042 -0.6142  0.1011  0.6837  2.6126 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -0.69457    0.34476  -2.015   0.0443 *  
## log2(Mean_DNA)  0.95075    0.03388  28.063   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9968 on 807 degrees of freedom
## Multiple R-squared:  0.4939, Adjusted R-squared:  0.4933 
## F-statistic: 787.5 on 1 and 807 DF,  p-value: < 2.2e-16
summary(lm_all_GC)
## 
## Call:
## lm(formula = log2(Mean_RNA) ~ log2(Mean_DNA) + GC_content, data = y_ASD1k_lm_GC)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.9511 -0.6158  0.0665  0.6938  2.5169 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -0.99635    0.45150  -2.207   0.0276 *  
## log2(Mean_DNA)  0.94105    0.03515  26.772   <2e-16 ***
## GC_content      0.44202    0.42705   1.035   0.3010    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9967 on 806 degrees of freedom
## Multiple R-squared:  0.4946, Adjusted R-squared:  0.4933 
## F-statistic: 394.3 on 2 and 806 DF,  p-value: < 2.2e-16

5.2.2 V4

summary(lm_V4)
## 
## Call:
## lm(formula = log2(Mean_RNA_V4) ~ log2(Mean_DNA_V4), data = y_ASD1k_lm_GC)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.0562 -0.8193  0.1170  0.9225  3.5783 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        2.29443    0.48942   4.688 3.24e-06 ***
## log2(Mean_DNA_V4)  1.28023    0.04827  26.520  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.338 on 807 degrees of freedom
## Multiple R-squared:  0.4657, Adjusted R-squared:  0.465 
## F-statistic: 703.3 on 1 and 807 DF,  p-value: < 2.2e-16
summary(lm_V4_GC)
## 
## Call:
## lm(formula = log2(Mean_RNA_V4) ~ log2(Mean_DNA_V4) + GC_content, 
##     data = y_ASD1k_lm_GC)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.9250 -0.7470  0.1043  0.8818  3.0027 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -0.95855    0.59734  -1.605    0.109    
## log2(Mean_DNA_V4)  1.17606    0.04766  24.675   <2e-16 ***
## GC_content         4.78077    0.54564   8.762   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.28 on 806 degrees of freedom
## Multiple R-squared:  0.5121, Adjusted R-squared:  0.5109 
## F-statistic: 423.1 on 2 and 806 DF,  p-value: < 2.2e-16

5.2.3 V5

summary(lm_V5)
## 
## Call:
## lm(formula = log2(Mean_RNA_V5) ~ log2(Mean_DNA_V5), data = y_ASD1k_lm_GC)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.7426 -0.6260  0.0790  0.7033  2.5238 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -2.44737    0.32364  -7.562 1.08e-13 ***
## log2(Mean_DNA_V5)  0.76735    0.03167  24.226  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9916 on 807 degrees of freedom
## Multiple R-squared:  0.421,  Adjusted R-squared:  0.4203 
## F-statistic: 586.9 on 1 and 807 DF,  p-value: < 2.2e-16
summary(lm_V5_GC)
## 
## Call:
## lm(formula = log2(Mean_RNA_V5) ~ log2(Mean_DNA_V5) + GC_content, 
##     data = y_ASD1k_lm_GC)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.9241 -0.6089  0.0848  0.6902  2.4878 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -1.41845    0.43066  -3.294 0.001032 ** 
## log2(Mean_DNA_V5)  0.79983    0.03272  24.443  < 2e-16 ***
## GC_content        -1.51765    0.42300  -3.588 0.000354 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9844 on 806 degrees of freedom
## Multiple R-squared:  0.4301, Adjusted R-squared:  0.4287 
## F-statistic: 304.2 on 2 and 806 DF,  p-value: < 2.2e-16

5.2.4 Summary of the models

library(stargazer)

stargazer(lm_all, lm_all_GC, lm_V4, lm_V4_GC, lm_V5, lm_V5_GC, 
          title="Regression Results",
          single.row = TRUE, 
          font.size = "huge",
          dep.var.labels.include = TRUE,
         
          column.labels=c("V4 + V5", "V4 + V5 w GC", "V4", "V4 w GC", "V5", "V5 w GC"),
          align=TRUE, 
          type = "html", 
          keep.stat=c("aic", "bic", "rsq", "n"),
          report=('vc*p'))
Regression Results
Dependent variable:
log2(Mean_RNA) log2(Mean_RNA_V4) log2(Mean_RNA_V5)
V4 + V5 V4 + V5 w GC V4 V4 w GC V5 V5 w GC
(1) (2) (3) (4) (5) (6)
log2(Mean_DNA) 0.951*** 0.941***
p = 0.000 p = 0.000
GC_content 0.442 4.781*** -1.518***
p = 0.301 p = 0.000 p = 0.0004
log2(Mean_DNA_V4) 1.280*** 1.176***
p = 0.000 p = 0.000
log2(Mean_DNA_V5) 0.767*** 0.800***
p = 0.000 p = 0.000
Constant -0.695** -0.996** 2.294*** -0.959 -2.447*** -1.418***
p = 0.045 p = 0.028 p = 0.00001 p = 0.109 p = 0.000 p = 0.002
Observations 809 809 809 809 809 809
R2 0.494 0.495 0.466 0.512 0.421 0.430
Akaike Inf. Crit. 2,294.600 2,295.500 2,771.300 2,699.700 2,286.200 2,275.300
Bayesian Inf. Crit. 2,308.700 2,314.300 2,785.400 2,718.500 2,300.200 2,294.100
Note: p<0.1; p<0.05; p<0.01

5.3 V4 vs V5 ratiometric and residuals

y_ASD1k_lm_GC <- na.omit(y_ASD1k_lm_GC)

p1 <- ggplot(y_ASD1k_lm_GC, aes(x = log2(MeanRatio_V4), y = log2(MeanRatio_V5)))+
    geom_point(alpha = 0.5)+
    theme_bw()+
    geom_hline(yintercept = 0)+
   geom_vline(xintercept = 0)+
    labs(title = "V4 vs V5 ratiometric activity")+
    geom_smooth(formula = formula, method = 'lm', color = "black")+
    stat_poly_eq(formula = formula, 
                aes(label = paste(..rr.label.., ..eq.label.., sep = "~~~")), 
                parse = TRUE)+
    theme(plot.title = element_text(hjust = 0.5, size = 14),
        text = element_text(size = 12))


y_ASD1k_lm_GC$V4_res <- lm_V4$residuals
y_ASD1k_lm_GC$V5_res <- lm_V5$residuals
y_ASD1k_lm_GC$V4_res_GC <- lm_V4_GC$residuals
y_ASD1k_lm_GC$V5_res_GC <- lm_V5_GC$residuals


p2 <- ggplot(y_ASD1k_lm_GC, aes(x = V4_res, y = V5_res))+
    geom_point(alpha = 0.5)+
    theme_bw()+
    geom_hline(yintercept = 0)+
   geom_vline(xintercept = 0)+
    labs(title = "V4 vs V5 residuals in models without GC")+
    geom_smooth(formula = formula, method = 'lm', color = "black")+
    stat_poly_eq(formula = formula, 
                aes(label = paste(..rr.label.., ..eq.label.., sep = "~~~")), 
                parse = TRUE)+
    theme(plot.title = element_text(hjust = 0.5, size = 14),
        text = element_text(size = 12)) 


p3 <- ggplot(y_ASD1k_lm_GC, aes(x = V4_res_GC, y = V5_res_GC))+
    geom_point(alpha = 0.5)+
    theme_bw()+
    geom_hline(yintercept = 0)+
   geom_vline(xintercept = 0)+
    labs(title = "V4 vs V5 residuals in models with GC")+
    geom_smooth(formula = formula, method = 'lm', color = "black")+
    stat_poly_eq(formula = formula, 
                aes(label = paste(..rr.label.., ..eq.label.., sep = "~~~")), 
                parse = TRUE)+
    theme(plot.title = element_text(hjust = 0.5, size = 14),
        text = element_text(size = 12))     



plot_grid(p1, p2, p3,
          nrow = 1,
          labels = c("A", "B", "C"), 
          vjust = 0.9)